{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# PySpark\n",
    "\n",
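    "This notebook shows how to load data from a [PySpark](https://spark.apache.org/docs/latest/api/python/) DataFrame into LangChain `Document` objects using `PySparkDataFrameLoader`. The column named by `page_content_column` becomes each document's `page_content`, and the remaining columns are stored as metadata."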
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "%pip install --upgrade --quiet  pyspark"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyspark.sql import SparkSession"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting default log level to \"WARN\".\n",
      "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n",
      "23/05/31 14:08:33 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n"
     ]
    }
   ],
   "source": [
    "spark = SparkSession.builder.getOrCreate()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = spark.read.csv(\"example_data/mlb_teams_2012.csv\", header=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain_community.document_loaders import PySparkDataFrameLoader"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "loader = PySparkDataFrameLoader(spark, df, page_content_column=\"Team\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[Stage 8:>                                                          (0 + 1) / 1]\r"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[Document(page_content='Nationals', metadata={' \"Payroll (millions)\"': '     81.34', ' \"Wins\"': ' 98'}),\n",
       " Document(page_content='Reds', metadata={' \"Payroll (millions)\"': '          82.20', ' \"Wins\"': ' 97'}),\n",
       " Document(page_content='Yankees', metadata={' \"Payroll (millions)\"': '      197.96', ' \"Wins\"': ' 95'}),\n",
       " Document(page_content='Giants', metadata={' \"Payroll (millions)\"': '       117.62', ' \"Wins\"': ' 94'}),\n",
       " Document(page_content='Braves', metadata={' \"Payroll (millions)\"': '        83.31', ' \"Wins\"': ' 94'}),\n",
       " Document(page_content='Athletics', metadata={' \"Payroll (millions)\"': '     55.37', ' \"Wins\"': ' 94'}),\n",
       " Document(page_content='Rangers', metadata={' \"Payroll (millions)\"': '      120.51', ' \"Wins\"': ' 93'}),\n",
       " Document(page_content='Orioles', metadata={' \"Payroll (millions)\"': '       81.43', ' \"Wins\"': ' 93'}),\n",
       " Document(page_content='Rays', metadata={' \"Payroll (millions)\"': '          64.17', ' \"Wins\"': ' 90'}),\n",
       " Document(page_content='Angels', metadata={' \"Payroll (millions)\"': '       154.49', ' \"Wins\"': ' 89'}),\n",
       " Document(page_content='Tigers', metadata={' \"Payroll (millions)\"': '       132.30', ' \"Wins\"': ' 88'}),\n",
       " Document(page_content='Cardinals', metadata={' \"Payroll (millions)\"': '    110.30', ' \"Wins\"': ' 88'}),\n",
       " Document(page_content='Dodgers', metadata={' \"Payroll (millions)\"': '       95.14', ' \"Wins\"': ' 86'}),\n",
       " Document(page_content='White Sox', metadata={' \"Payroll (millions)\"': '     96.92', ' \"Wins\"': ' 85'}),\n",
       " Document(page_content='Brewers', metadata={' \"Payroll (millions)\"': '       97.65', ' \"Wins\"': ' 83'}),\n",
       " Document(page_content='Phillies', metadata={' \"Payroll (millions)\"': '     174.54', ' \"Wins\"': ' 81'}),\n",
       " Document(page_content='Diamondbacks', metadata={' \"Payroll (millions)\"': '  74.28', ' \"Wins\"': ' 81'}),\n",
       " Document(page_content='Pirates', metadata={' \"Payroll (millions)\"': '       63.43', ' \"Wins\"': ' 79'}),\n",
       " Document(page_content='Padres', metadata={' \"Payroll (millions)\"': '        55.24', ' \"Wins\"': ' 76'}),\n",
       " Document(page_content='Mariners', metadata={' \"Payroll (millions)\"': '      81.97', ' \"Wins\"': ' 75'}),\n",
       " Document(page_content='Mets', metadata={' \"Payroll (millions)\"': '          93.35', ' \"Wins\"': ' 74'}),\n",
       " Document(page_content='Blue Jays', metadata={' \"Payroll (millions)\"': '     75.48', ' \"Wins\"': ' 73'}),\n",
       " Document(page_content='Royals', metadata={' \"Payroll (millions)\"': '        60.91', ' \"Wins\"': ' 72'}),\n",
       " Document(page_content='Marlins', metadata={' \"Payroll (millions)\"': '      118.07', ' \"Wins\"': ' 69'}),\n",
       " Document(page_content='Red Sox', metadata={' \"Payroll (millions)\"': '      173.18', ' \"Wins\"': ' 69'}),\n",
       " Document(page_content='Indians', metadata={' \"Payroll (millions)\"': '       78.43', ' \"Wins\"': ' 68'}),\n",
       " Document(page_content='Twins', metadata={' \"Payroll (millions)\"': '         94.08', ' \"Wins\"': ' 66'}),\n",
       " Document(page_content='Rockies', metadata={' \"Payroll (millions)\"': '       78.06', ' \"Wins\"': ' 64'}),\n",
       " Document(page_content='Cubs', metadata={' \"Payroll (millions)\"': '          88.19', ' \"Wins\"': ' 61'}),\n",
       " Document(page_content='Astros', metadata={' \"Payroll (millions)\"': '        60.65', ' \"Wins\"': ' 55'})]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "loader.load()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
